library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(rstatix)
library(expss)
library(DESeq2)
library(knitr) ; library(kableExtra)
# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
# Updates genes_info with SFARI information
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>%
mutate(gene.score = ifelse(is.na(`gene-score`) & Neuronal==0, 'Others',
ifelse(is.na(`gene-score`), 'Neuronal', `gene-score`))) %>%
mutate(Group = factor(ifelse(gene.score %in% c('Neuronal','Others'), gene.score, 'SFARI'),
levels = c('SFARI', 'Neuronal', 'Others')))
# SFARI palette
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
There are 864 SFARI Genes in the expression dataset (~77%), 16117 of them have a SFARI Gene Score.
Gene count by SFARI score:
table_info = genes_info %>% apply_labels(`gene-score` = 'SFARI Gene Score', syndromic = 'Syndromic Tag',
Neuronal = 'Neuronal Function', gene.score = 'Gene Score') %>%
mutate(syndromic = as.logical(syndromic), Neuronal = as.logical(Neuronal))
cro(table_info$`gene-score`)
|  #Total | |
|---|---|
|  SFARI Gene Score | |
| Â Â Â 1Â | 144 |
| Â Â Â 2Â | 205 |
| Â Â Â 3Â | 440 |
|    #Total cases | 789 |
Gene count by Syndromic tag:
cro(table_info$syndromic)
|  #Total | |
|---|---|
|  Syndromic Tag | |
| Â Â Â FALSEÂ | 748 |
| Â Â Â TRUEÂ | 116 |
|    #Total cases | 864 |
Neuronal annotations:
1545 genes have neuronal-related annotations, 201 of these, have a SFARI score
cro(table_info$gene.score[genes_info$`gene-score` %in% as.character(c(1:3))],
list(table_info$Neuronal[genes_info$`gene-score` %in% as.character(c(1:3))], total()))
|  Neuronal Function |  #Total | |||
|---|---|---|---|---|
| Â FALSEÂ | Â TRUEÂ | Â | ||
|  Gene Score | ||||
| Â Â Â 1Â | 97 | 47 | Â | 144 |
| Â Â Â 2Â | 146 | 59 | Â | 205 |
| Â Â Â 3Â | 345 | 95 | Â | 440 |
|    #Total cases | 588 | 201 |  | 789 |
rm(table_info)
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>%
left_join(genes_info, by='ID')
wt = plot_data %>% wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.7
pos_y_comparisons = c(base, base+increase, base)
plot_data %>% ggplot(aes(Group, MeanExpr)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
xlab('') + ylab('Mean Expression') + #ggtitle('Mean Expression Comparison') +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position='none')
The propotion of over and underexpressed genes is very simiar for each group, so we can just focus on the magnitude of the LFC independently of the sign.
genes_info %>% mutate(direction = ifelse(log2FoldChange>0, 'overexpressed', 'underexpressed')) %>%
group_by(Group, direction) %>% tally(name = 'overexpressed') %>%
filter(direction == 'overexpressed') %>% ungroup %>%
left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
mutate('underexpressed' = Total - overexpressed ,
'ratio' = round(overexpressed/underexpressed,2)) %>%
dplyr::select(Group, overexpressed, underexpressed, ratio) %>%
kable %>% kable_styling(full_width = F)
| Group | overexpressed | underexpressed | ratio |
|---|---|---|---|
| SFARI | 379 | 410 | 0.92 |
| Neuronal | 673 | 671 | 1.00 |
| Others | 7194 | 6790 | 1.06 |
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>%
left_join(genes_info, by='ID')
wt = plot_data %>% filter(log2FoldChange>0) %>%
wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.7
pos_y_comparisons = c(base, base+increase, base)
p1 = plot_data %>% filter(log2FoldChange>0) %>% ggplot(aes(Group, MeanExpr)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
xlab('') + ylab('Mean Expression') + ggtitle('Overexpressed genes') +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position='none')
wt = plot_data %>% filter(log2FoldChange<0) %>%
wilcox_test(MeanExpr~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
p2 = plot_data %>% filter(log2FoldChange<0) %>% ggplot(aes(Group, MeanExpr)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
xlab('') + ylab('Mean Expression') + ggtitle('Underexpressed genes') +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position='none')
grid.arrange(p1, p2, nrow=1)
Both with the original LFC values as well as the shrunken ones, the difference in LFC magnitude between SFARI genes and Neuronal genes is statisically significant with a p-value lower than \(10^{-4}\).
wt = genes_info %>% mutate(abs_lfc = abs(log2FoldChange),
Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>%
wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 0.04
base = 0.45
pos_y_comparisons = c(base, base + increase, base)
p1 = genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>%
ggplot(aes(Group, abs(log2FoldChange))) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .005) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) +
coord_cartesian(ylim= c(0, max(pos_y_comparisons))) +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')
wt = genes_info %>% mutate(abs_lfc = abs(shrunken_log2FoldChange),
Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>%
wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 0.03
base = 0.35
pos_y_comparisons = c(base, base + increase, base)
p2 = genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>%
ggplot(aes(Group, abs(shrunken_log2FoldChange))) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) +
coord_cartesian(ylim= c(0, max(pos_y_comparisons))) +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')
grid.arrange(p1, p2, nrow = 1)
rm(increase, base, pos_y_comparisons, wt)
Full plots, without cropping out outliers
p1 = ggplotly(genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>%
ggplot(aes(Group, abs(log2FoldChange))) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none'))
p2 = ggplotly(genes_info %>% mutate(Group = factor(Group, levels = c('SFARI','Others', 'Neuronal'))) %>%
ggplot(aes(Group, abs(shrunken_log2FoldChange))) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=Group)) +
scale_fill_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(7,8)))) +
scale_x_discrete(labels=c('SFARI' = 'SFARI\nGenes', 'Others' = 'Other\ngenes',
'Neuronal' = 'Neuronal\ngenes')) +
xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none'))
subplot(p1, p2, nrows = 1)
genes_info %>% group_by(Group, significant) %>% tally(name = 'DE') %>% filter(significant) %>% ungroup %>%
left_join(genes_info %>% group_by(Group) %>% tally(name = 'Total'), by = 'Group') %>% ungroup %>%
mutate('prop_DE' = round(DE/Total,2)) %>% dplyr::select(Group, DE, prop_DE, Total) %>%
kable(caption = 'Proportion of DE Genes by Group') %>% kable_styling(full_width = F)
| Group | DE | prop_DE | Total |
|---|---|---|---|
| SFARI | 242 | 0.31 | 789 |
| Neuronal | 482 | 0.36 | 1344 |
| Others | 3749 | 0.27 | 13984 |
lfc_list = seq(0, 0.3, 0.01)
all_counts = data.frame('group'='All', 'n'=as.character(nrow(genes_info)))
Others_counts = data.frame('group'='Others', n=as.character(sum(genes_info$Group == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(genes_info$Group == 'Neuronal')))
lfc_counts_all = genes_info %>% filter(Group == 'SFARI') %>% tally %>%
mutate('group'='SFARI', 'n'=as.character(n)) %>%
dplyr::select(group, n) %>%
bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
mutate('lfc'=-1) %>% dplyr::select(lfc, group, n)
for(lfc in lfc_list){
# Recalculate genes_info with the new threshold (p-values change)
DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>%
mutate('ID' = rownames(.)) %>%
left_join(genes_info %>% dplyr::select(ID, Neuronal, gene.score, Group), by = 'ID') %>%
filter(padj<0.05 & abs(log2FoldChange)>lfc)
# Calculate counts by groups
all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
Others_counts = data.frame('group'='Others', n=as.character(sum(DE_genes$Group == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$Group == 'Neuronal')))
lfc_counts = DE_genes %>% filter(Group == 'SFARI') %>% tally %>%
mutate('group'='SFARI', 'n'=as.character(n)) %>%
bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
# Update lfc_counts_all
lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}
# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>%
left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)
# Calculate percentage of each group remaining
tot_counts = genes_info %>% filter(Group == 'SFARI') %>% tally() %>%
mutate('group'='SFARI', 'tot'=n) %>% dplyr::select(group, tot) %>%
bind_rows(data.frame('group'='Neuronal', 'tot'=sum(genes_info$Group == 'Neuronal')),
data.frame('group' = 'Others', 'tot' = sum(genes_info$Group == 'Others')),
data.frame('group'='All', 'tot'=nrow(genes_info)))
lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1) %>% #, group!='Others') %>%
left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))
# Plot change of number of genes
ggplotly(lfc_counts_all %>% filter(group != 'All') %>%
mutate(group = factor(group, levels = c('SFARI', 'Neuronal', 'Others'))) %>%
ggplot(aes(lfc, perc, color=group)) + geom_point(aes(id=n)) + geom_line() +
scale_color_manual(values=c('#00A4F7', SFARI_colour_hue(r=c(8,7)))) +
ylab('Percentage of DE Genes') + xlab('LFC threshold') + labs(color = 'Group') +
#ggtitle('Effect of filtering thresholds in SFARI Genes') +
theme_minimal() + theme(legend.position = 'bottom'))
rm(lfc_list, all_counts, Others_counts, Neuronal_counts, lfc_counts, lfc_counts_all, DE_genes, tot_counts)
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>% left_join(genes_info, by='ID')
wt = plot_data %>% wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.5
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
plot_data %>% ggplot(aes(gene.score, MeanExpr)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
xlab('') + ylab('Mean Expression') +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position='none')
genes_info %>% mutate(direction = ifelse(log2FoldChange>0, 'overexpressed', 'underexpressed')) %>%
group_by(gene.score, direction) %>% tally(name = 'overexpressed') %>%
filter(direction == 'overexpressed') %>% ungroup %>%
left_join(genes_info %>% group_by(gene.score) %>% tally(name = 'Total'), by = 'gene.score') %>% ungroup %>%
mutate('underexpressed' = Total - overexpressed ,
'ratio' = round(overexpressed/underexpressed,2)) %>%
dplyr::select(gene.score, overexpressed, underexpressed, ratio) %>%
kable %>% kable_styling(full_width = F)
| gene.score | overexpressed | underexpressed | ratio |
|---|---|---|---|
| 1 | 66 | 78 | 0.85 |
| 2 | 102 | 103 | 0.99 |
| 3 | 211 | 229 | 0.92 |
| Neuronal | 673 | 671 | 1.00 |
| Others | 7194 | 6790 | 1.06 |
plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr)) %>%
left_join(genes_info, by='ID')
wt = plot_data %>% filter(log2FoldChange>0) %>%
wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 1
base = 15.5
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p1 = plot_data %>% filter(log2FoldChange>0) %>% ggplot(aes(gene.score, MeanExpr)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
xlab('') + ylab('Mean Expression') + ggtitle('Overexpressed genes') +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position='none')
wt = plot_data %>% filter(log2FoldChange<0) %>%
wilcox_test(MeanExpr~gene.score, p.adjust.method='BH') %>% add_x_position(x = 'group')
p2 = plot_data %>% filter(log2FoldChange<0) %>% ggplot(aes(gene.score, MeanExpr)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .03) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
xlab('') + ylab('Mean Expression') + ggtitle('Underexpressed genes') +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Others' = 'Other\ngenes', 'Neuronal' = 'Neuronal\ngenes')) +
theme_minimal() + theme(legend.position='none')
grid.arrange(p1, p2, nrow=1)
Both with the original LFC values as well as the shrunken ones, the difference in LFC magnitude between SFARI genes and Neuronal genes is statisically significant with a p-value lower than \(10^{-4}\).
wt = genes_info %>% mutate(abs_lfc = abs(log2FoldChange),
Group = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 0.04
base = 0.45
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p1 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
ggplot(aes(gene.score, abs(log2FoldChange))) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .005) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) +
coord_cartesian(ylim= c(0, max(pos_y_comparisons))) +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Neuronal' = 'Neuronal\ngenes','Others' = 'Other\ngenes')) +
xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')
wt = genes_info %>% mutate(abs_lfc = abs(shrunken_log2FoldChange),
Group = factor(gene.score, levels = c('1','2','3','Others', 'Neuronal'))) %>%
wilcox_test(abs_lfc~Group, p.adjust.method='BH') %>% add_x_position(x = 'group')
increase = 0.03
base = 0.35
pos_y_comparisons = c(base + c(0,1,3,5)*increase, base+c(0,2,4)*increase, base+0:1*increase, base)
p2 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
ggplot(aes(gene.score, abs(shrunken_log2FoldChange))) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
stat_pvalue_manual(wt, y.position = pos_y_comparisons, tip.length = .01) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) +
coord_cartesian(ylim= c(0, max(pos_y_comparisons))) +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Neuronal' = 'Neuronal\ngenes', 'Others' = 'Other\ngenes')) +
xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')
grid.arrange(p1, p2, nrow = 1)
rm(increase, base, pos_y_comparisons, wt)
Full plots, without cropping out outliers
p1 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
ggplot(aes(gene.score, abs(log2FoldChange))) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Neuronal' = 'Neuronal\ngenes','Others' = 'Other\ngenes')) +
xlab('') + ylab('Original LFC Magnitude') + theme_minimal() + theme(legend.position='none')
p2 = genes_info %>% mutate(gene.score = factor(gene.score, levels = c('1','2','3','Others','Neuronal'))) %>%
ggplot(aes(gene.score, abs(shrunken_log2FoldChange))) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3, aes(fill=gene.score)) +
scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,7,8))) +
scale_x_discrete(labels=c('1' = 'SFARI\nScore 1', '2' = 'SFARI\nScore 2', '3' = 'SFARI\nScore 3',
'Neuronal' = 'Neuronal\ngenes', 'Others' = 'Other\ngenes')) +
xlab('') + ylab('Shrunken LFC Magnitude') + theme_minimal() + theme(legend.position='none')
subplot(ggplotly(p1), ggplotly(p2), nrows=1)
rm(increase, base, pos_y_comparisons, wt)
## Warning in rm(increase, base, pos_y_comparisons, wt): object 'increase' not
## found
## Warning in rm(increase, base, pos_y_comparisons, wt): object 'base' not found
## Warning in rm(increase, base, pos_y_comparisons, wt): object 'pos_y_comparisons'
## not found
## Warning in rm(increase, base, pos_y_comparisons, wt): object 'wt' not found
lfc_list = seq(0, 0.3, 0.01)
all_counts = data.frame('group'='All', 'n'=as.character(nrow(genes_info)))
Others_counts = data.frame('group'='Others', n=as.character(sum(genes_info$gene.score == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(genes_info$gene.score == 'Neuronal')))
lfc_counts_all = genes_info %>% group_by(`gene-score`) %>% filter(`gene-score` != 'Others') %>% tally %>%
mutate('group'=as.factor(`gene-score`), 'n'=as.character(n)) %>%
dplyr::select(group, n) %>%
bind_rows(Neuronal_counts, Others_counts, all_counts) %>%
mutate('lfc'=-1) %>% dplyr::select(lfc, group, n)
for(lfc in lfc_list){
# Recalculate genes_info with the new threshold (p-values change)
DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>%
mutate('ID'=rownames(.)) %>%
left_join(genes_info %>% dplyr::select(ID, Neuronal, gene.score), by='ID') %>%
filter(padj<0.05 & abs(log2FoldChange)>lfc)
# Calculate counts by groups
all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
Others_counts = data.frame('group'='Others', n=as.character(sum(DE_genes$gene.score == 'Others')))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$gene.score == 'Neuronal')))
lfc_counts = DE_genes %>% group_by(gene.score) %>% tally %>%
mutate('group'=gene.score, 'n'=as.character(n)) %>%
bind_rows(Neuronal_counts, all_counts) %>%
mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
# Update lfc_counts_all
lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}
# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>%
left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)
# Calculate percentage of each group remaining
tot_counts = genes_info %>% group_by(`gene-score`) %>% tally() %>% filter(`gene-score`!='Others') %>%
mutate('group'=factor(`gene-score`), 'tot'=n) %>% dplyr::select(group, tot) %>%
bind_rows(data.frame('group'='Neuronal', 'tot'=sum(genes_info$gene.score == 'Neuronal')),
data.frame('group'='Others', 'tot'=sum(genes_info$gene.score == 'Others')),
data.frame('group'='All', 'tot'=nrow(genes_info)))
lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1) %>% #, group!='Others') %>%
left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))
# Plot change of number of genes
ggplotly(lfc_counts_all %>% filter(group != 'All') %>%
mutate(group = ifelse(group %in% c('1','2','3'), paste0('Score ',group), group)) %>%
mutate(group = factor(group, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal', 'Others'))) %>%
ggplot(aes(lfc, perc, color=group)) +
geom_point(aes(id=n)) + geom_line() + scale_color_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) +
ylab('Percentage of DE Genes') + xlab('LFC threshold') + labs(color = 'Group') +
#ggtitle('Effect of filtering thresholds by SFARI score') +
#guides(color=guide_legend(nrow=2,byrow=TRUE) + # for breaking the legend into two rows
theme_minimal() + theme(legend.position = 'bottom'))
rm(lfc_list, all_counts, Neuronal_counts, lfc_counts_all, lfc, lfc_counts, lfc_counts_all, tot_counts,
lfc_counts_all, Others_counts)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] kableExtra_1.1.0 knitr_1.32
## [3] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [5] DelayedArray_0.10.0 BiocParallel_1.18.1
## [7] matrixStats_0.56.0 Biobase_2.44.0
## [9] GenomicRanges_1.36.1 GenomeInfoDb_1.20.0
## [11] IRanges_2.18.3 S4Vectors_0.22.1
## [13] BiocGenerics_0.30.0 expss_0.10.2
## [15] rstatix_0.6.0 ggpubr_0.2.5
## [17] magrittr_2.0.1 GGally_1.5.0
## [19] gridExtra_2.3 viridis_0.5.1
## [21] viridisLite_0.3.0 RColorBrewer_1.1-2
## [23] plotly_4.9.2 glue_1.4.2
## [25] reshape2_1.4.4 forcats_0.5.0
## [27] stringr_1.4.0 dplyr_1.0.1
## [29] purrr_0.3.4 readr_1.3.1
## [31] tidyr_1.1.0 tibble_3.0.3
## [33] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ggsignif_0.6.0 ellipsis_0.3.1
## [4] rio_0.5.16 htmlTable_1.13.3 XVector_0.24.0
## [7] base64enc_0.1-3 fs_1.5.0 rstudioapi_0.11
## [10] farver_2.0.3 bit64_4.0.5 AnnotationDbi_1.46.1
## [13] fansi_0.4.1 lubridate_1.7.4 xml2_1.2.5
## [16] splines_3.6.3 geneplotter_1.62.0 Formula_1.2-3
## [19] jsonlite_1.7.2 annotate_1.62.0 broom_0.7.0
## [22] cluster_2.1.0 dbplyr_1.4.2 png_0.1-7
## [25] compiler_3.6.3 httr_1.4.2 backports_1.1.8
## [28] assertthat_0.2.1 Matrix_1.2-18 lazyeval_0.2.2
## [31] cli_2.0.2 acepack_1.4.1 htmltools_0.5.1.1
## [34] tools_3.6.3 gtable_0.3.0 GenomeInfoDbData_1.2.1
## [37] Rcpp_1.0.6 carData_3.0-3 cellranger_1.1.0
## [40] vctrs_0.3.2 crosstalk_1.1.0.1 xfun_0.22
## [43] openxlsx_4.1.4 rvest_0.3.5 lifecycle_0.2.0
## [46] XML_3.99-0.3 zlibbioc_1.30.0 scales_1.1.1
## [49] hms_0.5.3 yaml_2.2.1 curl_4.3
## [52] memoise_1.1.0 rpart_4.1-15 RSQLite_2.2.0
## [55] reshape_0.8.8 latticeExtra_0.6-29 stringi_1.5.3
## [58] highr_0.8 genefilter_1.66.0 checkmate_2.0.0
## [61] zip_2.0.4 rlang_0.4.10 pkgconfig_2.0.3
## [64] bitops_1.0-6 evaluate_0.14 lattice_0.20-41
## [67] labeling_0.3 htmlwidgets_1.5.1 bit_4.0.4
## [70] tidyselect_1.1.0 plyr_1.8.6 R6_2.5.0
## [73] generics_0.0.2 Hmisc_4.4-0 DBI_1.1.0
## [76] pillar_1.4.6 haven_2.2.0 foreign_0.8-76
## [79] withr_2.2.0 nnet_7.3-14 survival_3.2-7
## [82] abind_1.4-5 RCurl_1.98-1.2 modelr_0.1.6
## [85] crayon_1.3.4 car_3.0-7 rmarkdown_2.7
## [88] jpeg_0.1-8.1 locfit_1.5-9.4 grid_3.6.3
## [91] readxl_1.3.1 data.table_1.12.8 blob_1.2.1
## [94] webshot_0.5.2 reprex_0.3.0 digest_0.6.27
## [97] xtable_1.8-4 munsell_0.5.0